set.seed(895893)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(broom)
library(papaja)
## Loading required package: tinylabels
This document outlines using the suggested procedure to simulate the number of participants necessary to accurately measure items. There are two key issues these ideas should address that we know about power:
Population. The data was simulated using the
rnorm function assuming a normal distribution for 30 rating
scale type items rounded to two decimal places. The item means were
simulated as \(\mu\) = 4 points with a
\(\sigma\) of .25 points. The item
\(\sigma\)s were simulated as 2 points
with a low variability (\(\sigma\) =
.2), medium variability (.4), and high variability (.8). Each population
was simulated with 1000 data points.
# mu <- rnorm(30, 4, .25)
# sigma1 <- rnorm(30, 2, .2)
# sigma2 <- rnorm(30, 2, .4)
# sigma3 <- rnorm(30, 2, .8)
mu <- rnorm(30, 847.48, 150)
sigma1 <- rnorm(30, 350, 100)
sigma2 <- rnorm(30, 350, 150)
sigma3 <- rnorm(30, 350, 200)
population1 <- data.frame(
item = rep(1:30, 1000),
score = rnorm(1000*30, mean = mu, sd = sigma1)
)
population2 <- data.frame(
item = rep(1:30, 1000),
score = rnorm(1000*30, mean = mu, sd = sigma2)
)
population3 <- data.frame(
item = rep(1:30, 1000),
score = rnorm(1000*30, mean = mu, sd = sigma3)
)
#evidence that they are simulated correctly
tapply(population1$score, population1$item, mean) # around 4
## 1 2 3 4 5 6 7 8
## 765.6036 664.7785 725.2771 807.4432 538.3648 900.3546 910.2568 1052.8264
## 9 10 11 12 13 14 15 16
## 630.9712 832.6903 420.4910 729.7713 1002.6954 829.4935 860.7070 767.0554
## 17 18 19 20 21 22 23 24
## 833.1047 902.2030 1075.1021 861.4978 878.1431 1041.9373 855.7506 937.2122
## 25 26 27 28 29 30
## 679.9668 764.6669 972.6437 767.2876 703.4890 1134.4645
tapply(population1$score, population1$item, sd) # around 2
## 1 2 3 4 5 6 7 8
## 311.2448 360.3193 322.2985 338.7658 314.7784 287.8927 293.7953 420.0879
## 9 10 11 12 13 14 15 16
## 520.1255 225.5955 126.2305 260.6342 345.8144 235.1180 335.0544 378.1748
## 17 18 19 20 21 22 23 24
## 469.8720 449.9834 368.2411 272.1977 452.8620 242.6836 224.4292 368.2582
## 25 26 27 28 29 30
## 428.3773 276.0312 446.6273 347.6342 501.1615 132.2046
sd(tapply(population1$score, population1$item, sd)) # around .2
## [1] 99.311
tapply(population2$score, population2$item, mean) # around 4
## 1 2 3 4 5 6 7 8
## 725.6040 674.2798 721.9513 819.0289 567.8612 908.0273 909.3387 1017.5575
## 9 10 11 12 13 14 15 16
## 633.3741 832.8185 425.6334 734.1888 991.4534 826.5033 896.6126 765.6925
## 17 18 19 20 21 22 23 24
## 816.4628 908.4821 1063.4440 871.5552 910.0602 1032.7541 858.2532 913.4935
## 25 26 27 28 29 30
## 689.3946 770.0179 952.0898 780.5229 744.3489 1148.2740
tapply(population2$score, population2$item, sd) # around 2
## 1 2 3 4 5 6 7 8
## 628.7929 133.9378 180.8274 444.5191 534.7801 236.9617 374.8703 272.8363
## 9 10 11 12 13 14 15 16
## 533.0573 382.9958 510.5852 202.9281 384.3099 168.4936 486.3912 201.9486
## 17 18 19 20 21 22 23 24
## 510.6338 194.6759 361.3305 318.7460 290.1266 361.5759 382.1639 641.2955
## 25 26 27 28 29 30
## 427.6073 118.5997 572.2905 226.0813 319.1692 253.5639
sd(tapply(population2$score, population2$item, sd)) # around .4
## [1] 149.3125
tapply(population3$score, population3$item, mean) # around 4
## 1 2 3 4 5 6 7 8
## 750.3565 653.3462 720.5342 826.4865 561.6759 887.3865 914.7876 1015.7814
## 9 10 11 12 13 14 15 16
## 628.8596 811.8146 440.8774 720.1106 1003.1874 819.4390 873.3109 774.0752
## 17 18 19 20 21 22 23 24
## 831.6783 921.3816 1054.2702 873.8773 912.0118 1044.3531 847.7998 917.2589
## 25 26 27 28 29 30
## 676.2947 749.4620 938.1922 736.1093 708.5602 1133.4742
tapply(population3$score, population3$item, sd) # around 2
## 1 2 3 4 5 6 7
## 707.46915 521.55827 432.00590 413.38828 295.58972 327.81660 694.47902
## 8 9 10 11 12 13 14
## 333.32744 682.78489 373.40442 586.09688 716.46934 136.71397 429.01578
## 15 16 17 18 19 20 21
## 35.82413 243.38181 314.72626 509.47587 857.63676 63.15200 158.46518
## 22 23 24 25 26 27 28
## 437.23435 509.70987 1157.50955 276.46220 235.43245 538.14213 552.93501
## 29 30
## 324.61305 306.14171
sd(tapply(population3$score, population3$item, sd)) # around .8
## [1] 242.2529
Samples. Each population was then sampled as if a researcher was conducting a pilot study. The sample sizes started at 20 participants per item increasing in units of 5 up to 100 participants.
# create pilot samples from 20 to 100
samples1 <- samples2 <- samples3 <- list()
sizes <- c(20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100)
for (i in 1:length(sizes)){
samples1[[i]] <- population1 %>%
group_by(item) %>%
slice_sample(n = sizes[i])
samples2[[i]] <- population2 %>%
group_by(item) %>%
slice_sample(n = sizes[i])
samples3[[i]] <- population3 %>%
group_by(item) %>%
slice_sample(n = sizes[i])
}
AIPE Criterions. The standard errors of each item were calculated to mimic the AIPE procedure of finding an appropriately small confidence interval, as standard error functions as the main component in the formula for normal distribution confidence intervals. Standard errors were calculated at each decile of the items up to 90% (0% smallest SE, 10% …, 90% largest SE). The lower deciles would represent a strict criterion for accurate measurement, as many items would need smaller SEs to meet AIPE goals, while the higher deciles would represent less strict criterions for AIPE goals.
# calculate the SEs and the cutoff scores
SES1 <- SES2 <- SES3 <- list()
cutoffs1 <- cutoffs2 <- cutoffs3 <- list()
for (i in 1:length(samples1)){
SES1[[i]] <- tapply(samples1[[i]]$score,
samples1[[i]]$item,
function (x){ sd(x)/sqrt(length(x))})
SES2[[i]] <- tapply(samples2[[i]]$score,
samples2[[i]]$item,
function (x){ sd(x)/sqrt(length(x))})
SES3[[i]] <- tapply(samples3[[i]]$score,
samples3[[i]]$item,
function (x){ sd(x)/sqrt(length(x))})
cutoffs1[[i]] <- quantile(SES1[[i]], probs = seq(0, .9, by = .1))
cutoffs2[[i]] <- quantile(SES2[[i]], probs = seq(0, .9, by = .1))
cutoffs3[[i]] <- quantile(SES3[[i]], probs = seq(0, .9, by = .1))
}
In this section, we simulate what a researcher might do if they follow our suggested application of AIPE to sample size planning based on well measured items. Assuming each pilot sample represents a dataset a researcher has collected, we will simulate samples of 20 to 500 to determine what the new sample size suggestion would be. We assume that samples over 500 may be considered too large for many researchers who do not work in teams. The standard error of each item was calculated for each suggested sample size by pilot sample size by population type.
# sequence of sample sizes to try
samplesize_values <- seq(20, 1000, 5)
# place to store everything
sampled_values1 <- sampled_values2 <- sampled_values3 <- list()
# loop over the samples
for (i in 1:length(samples1)){
# create a blank table for us to save the values in
sim_table1 <- matrix(NA,
nrow = length(samplesize_values),
ncol = 30)
# make it a data frame
sim_table1 <- sim_table2 <- sim_table3 <- as.data.frame(sim_table1)
# add a place for sample size values
sim_table1$sample_size <- sim_table2$sample_size <- sim_table3$sample_size <- NA
# loop over pilot sample sizes
for (q in 1:length(samplesize_values)){
# temp dataframe that samples and summarizes
temp <- samples1[[i]] %>%
group_by(item) %>%
slice_sample(n = samplesize_values[q], replace = T) %>%
summarize(se = sd(score)/sqrt(length(score)))
sim_table1[q, 1:30] <- temp$se
sim_table1[q, 31] <- samplesize_values[q]
temp <- samples2[[i]] %>%
group_by(item) %>%
slice_sample(n = samplesize_values[q], replace = T) %>%
summarize(se = sd(score)/sqrt(length(score)))
sim_table2[q, 1:30] <- temp$se
sim_table2[q, 31] <- samplesize_values[q]
temp <- samples3[[i]] %>%
group_by(item) %>%
slice_sample(n = samplesize_values[q], replace = T) %>%
summarize(se = sd(score)/sqrt(length(score)))
sim_table3[q, 1:30] <- temp$se
sim_table3[q, 31] <- samplesize_values[q]
} # end pilot sample loop
sampled_values1[[i]] <- sim_table1
sampled_values2[[i]] <- sim_table2
sampled_values3[[i]] <- sim_table3
} # end all sample loop
Next, the percent of items that fall below the cutoff scores, and thus, would be considered “well-measured” were calculated for each decile by sample.
summary_list1 <- summary_list2 <- summary_list3 <- list()
for (i in 1:length(sampled_values1)){
summary_list1[[i]] <- sampled_values1[[i]] %>%
pivot_longer(cols = -c(sample_size)) %>%
rename(item = name, se = value) %>%
group_by(sample_size) %>%
summarize(Percent_Below0 = sum(se <= cutoffs1[[i]][1])/30,
Percent_Below10 = sum(se <= cutoffs1[[i]][2])/30,
Percent_Below20 = sum(se <= cutoffs1[[i]][3])/30,
Percent_Below30 = sum(se <= cutoffs1[[i]][4])/30,
Percent_Below40 = sum(se <= cutoffs1[[i]][5])/30,
Percent_Below50 = sum(se <= cutoffs1[[i]][6])/30,
Percent_Below60 = sum(se <= cutoffs1[[i]][7])/30,
Percent_Below70 = sum(se <= cutoffs1[[i]][8])/30,
Percent_Below80 = sum(se <= cutoffs1[[i]][9])/30,
Percent_Below90 = sum(se <= cutoffs1[[i]][10])/30) %>%
mutate(original_n = sizes[i])
summary_list2[[i]] <- sampled_values2[[i]] %>%
pivot_longer(cols = -c(sample_size)) %>%
rename(item = name, se = value) %>%
group_by(sample_size) %>%
summarize(Percent_Below0 = sum(se <= cutoffs2[[i]][1])/30,
Percent_Below10 = sum(se <= cutoffs2[[i]][2])/30,
Percent_Below20 = sum(se <= cutoffs2[[i]][3])/30,
Percent_Below30 = sum(se <= cutoffs2[[i]][4])/30,
Percent_Below40 = sum(se <= cutoffs2[[i]][5])/30,
Percent_Below50 = sum(se <= cutoffs2[[i]][6])/30,
Percent_Below60 = sum(se <= cutoffs2[[i]][7])/30,
Percent_Below70 = sum(se <= cutoffs2[[i]][8])/30,
Percent_Below80 = sum(se <= cutoffs2[[i]][9])/30,
Percent_Below90 = sum(se <= cutoffs2[[i]][10])/30) %>%
mutate(original_n = sizes[i])
summary_list3[[i]] <- sampled_values3[[i]] %>%
pivot_longer(cols = -c(sample_size)) %>%
rename(item = name, se = value) %>%
group_by(sample_size) %>%
summarize(Percent_Below0 = sum(se <= cutoffs3[[i]][1])/30,
Percent_Below10 = sum(se <= cutoffs3[[i]][2])/30,
Percent_Below20 = sum(se <= cutoffs3[[i]][3])/30,
Percent_Below30 = sum(se <= cutoffs3[[i]][4])/30,
Percent_Below40 = sum(se <= cutoffs3[[i]][5])/30,
Percent_Below50 = sum(se <= cutoffs3[[i]][6])/30,
Percent_Below60 = sum(se <= cutoffs3[[i]][7])/30,
Percent_Below70 = sum(se <= cutoffs3[[i]][8])/30,
Percent_Below80 = sum(se <= cutoffs3[[i]][9])/30,
Percent_Below90 = sum(se <= cutoffs3[[i]][10])/30) %>%
mutate(original_n = sizes[i])
}
summary_DF <- bind_rows(summary_list1,
summary_list2,
summary_list3)
summary_DF$source <- rep(c("low", "med", "high"), each = length(sampled_values1)*length(samplesize_values))
From this data, we pinpoint the smallest suggested sample size at which 80%, 85%, 90%, and 95% of the items fall below the cutoff criterion. These values were chosen as popular measures of “power” in which one could determine the minimum suggested sample size (potentially 80% of items) and the maximum suggested sample size (potentially 90%).
summary_long_80 <- summary_DF %>%
pivot_longer(cols = -c(sample_size, original_n, source)) %>%
filter(value >= .80) %>%
arrange(sample_size, original_n, source, name) %>%
group_by(original_n, name, source) %>%
slice_head(n = 1) %>%
mutate(power = 80)
summary_long_85 <- summary_DF %>%
pivot_longer(cols = -c(sample_size, original_n, source)) %>%
filter(value >= .85) %>%
arrange(sample_size, original_n, source, name) %>%
group_by(original_n, name, source) %>%
slice_head(n = 1) %>%
mutate(power = 85)
summary_long_90 <- summary_DF %>%
pivot_longer(cols = -c(sample_size, original_n, source)) %>%
filter(value >= .90) %>%
arrange(sample_size, original_n, source, name) %>%
group_by(original_n, name, source) %>%
slice_head(n = 1) %>%
mutate(power = 90)
summary_long_95 <- summary_DF %>%
pivot_longer(cols = -c(sample_size, original_n, source)) %>%
filter(value >= .95) %>%
arrange(sample_size, original_n, source, name) %>%
group_by(original_n, name, source) %>%
slice_head(n = 1) %>%
mutate(power = 95)
summary_long <- rbind(summary_long_80,
summary_long_85,
summary_long_90,
summary_long_95)
summary_long$source <- factor(summary_long$source,
levels = c("low", "med", "high"),
labels = c("Low", "Medium", "High"))
summary_long$sd_original <- .2
summary_long$sd_original[summary_long$source == "Medium"] <- .4
summary_long$sd_original[summary_long$source == "High"] <- .8
summary_long$name <- gsub("Percent_Below", "Decile ", summary_long$name)
We examined if this procedure is sensitive to differences in item heterogeneity, as we should expect to collect larger samples if we wish to have a large number of items reach a threshold of acceptable variance; potentially, assuring we could average them if a researcher did not wish to use a more complex analysis such as multilevel modeling.
The figure below illustrates the potential minimum sample size for 80% of items to achieve a desired cutoff score. The black dots denote the original sample size against the suggested sample size. By comparing the facets, we can determine that our suggested procedure does capture the differences in heterogeneity. As heterogeneity increases in item variances, the proposed sample size also increases, especially at stricter cutoffs. Missing cutoff points where sample sizes proposed would be higher than 500.
ggplot(summary_long %>% filter(power == 80), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ source) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
In our second question, we examined if the suggested procedure was sensitive to the amount of information present in the pilot data. Larger pilot data is more informative, and therefore, we should expect a lower projected sample size. As shown in the figure below for only the low variability data, we do not find this effect. These simplistic simulations from the pilot data would nearly always suggest a larger sample size - mostly in a linear trend increasing with sample sizes. This result comes from the nature of the procedure - if we base our estimates on some SE cutoff, we will almost always need a bit more people for items to meet those goals. This result does not achieve our second goal.
ggplot(summary_long %>% filter(source == "Low"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
Therefore, we suggest using a correction factor on the simulation procedure to account for the known asymptotic nature of power (i.e., at larger sample sizes power increases level off). For this function, we combined a correction factor for upward biasing of effect sizes (Hedges’ correction) with the formula for exponential decay calculations. The decay factor is calculated as follows:
\[ 1 - \sqrt{\frac{N_{pilot} - min(N_{sim})}{N_{pilot}}}^{log_2(N_{pilot})}\] \(N_{pilot}\) indicates the sample size of the pilot data minus the minimum sample size for simulation to ensure that the smallest sample sizes do not decay (e.g., the formula zeroes out). This value is raised to the power of \(log_2\) of the sample size of the pilot data, which decreases the impact of the decay to smaller increments for increasing sample sizes. This value is then multiplied by the proposed sample size. As show in the figure below, this correction factor produces the dsired quality of maintaining that small pilot studies should increase sample size, and that sample size suggestions level off as pilot study data sample size increases.
decay <- 1-sqrt((summary_long$original_n-20)/summary_long$original_n)^log2(summary_long$original_n)
summary_long$new_sample <- summary_long$sample_size*decay
ggplot(summary_long %>% filter(source == "Low"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
We have portrayed that this procedure, with a correction factor, can perform as desired. However, within real scenarios, researchers will only have one pilot sample, not many as show here. What should the researcher do to correct their sample size on their own pilot data?
To explore if we could recover the new suggested sample size from data a researcher would have, we used linear models to create a formula for calculation. First, the corrected sample size was predicted by the original suggested sample size. Next, the standard deviation of the item standard deviations was added to the equation. Last, we included the pilot sample size.
user_model <- lm(new_sample ~ sample_size, data = summary_long)
user_print <- apa_print(user_model)
user_model2 <- lm(new_sample ~ sample_size + sd_original, data = summary_long)
user_print2 <- apa_print(user_model2)
user_model3 <- lm(new_sample ~ sample_size + sd_original + original_n, data = summary_long)
user_print3 <- apa_print(user_model3)
change_table <- tidy(anova(user_model, user_model2, user_model3))
The first model using original sample size to predict new sample size was significant, \(R^2 = .92\), 90% CI \([0.92, 0.93]\), \(F(1, 1,869) = 22,165.05\), \(p < .001\), capturing nearly 90% of the variance. The second model with item standard deviation was better then the first model F(1, 1868) = 1.4457031, p < .001, \(R^2 = .92\), 90% CI \([0.92, 0.93]\). The addition of the original pilot sample size was also significant, F(1, 1867) = 1848.4067115, p < .001, \(R^2 = .96\), 90% CI \([0.96, 0.96]\).
As shown in the table below, the new suggested sample size is proportional to the original suggested sample size (i.e., b < 1), which reduces the sample size suggestion. As variability increases, the suggested sample size also increases to capture differences in heterogeneity shown above. Last, in order to correct for large pilot data, the original pilot sample size decreases the new suggested sample size. This formula approximation captures 96% of the variance in sample size scores and should allow a researcher to estimate based on their own data.
apa_table(tidy(user_model3))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 65.19 | 2.00 | 32.65 | 0.00 |
| sample_size | 0.70 | 0.00 | 212.34 | 0.00 |
| sd_original | -0.83 | 2.55 | -0.33 | 0.74 |
| original_n | -1.14 | 0.03 | -42.99 | 0.00 |
by_cutoff <- list()
R2 <- list()
for (cutoff in unique(summary_long$name)){
by_cutoff[[cutoff]] <- lm(new_sample ~ sample_size + sd_original + original_n, data = summary_long %>% filter(name == cutoff))
R2[cutoff] <- summary(by_cutoff[[cutoff]])$r.squared
}
R2
## $`Decile 0`
## [1] 0.9812539
##
## $`Decile 10`
## [1] 0.9665002
##
## $`Decile 20`
## [1] 0.9727879
##
## $`Decile 30`
## [1] 0.9723443
##
## $`Decile 40`
## [1] 0.9685064
##
## $`Decile 50`
## [1] 0.9658231
##
## $`Decile 60`
## [1] 0.9675184
##
## $`Decile 70`
## [1] 0.9686524
##
## $`Decile 80`
## [1] 0.9681113
##
## $`Decile 90`
## [1] 0.9677101
Last, we examine the question of an appropriate SE decile. All graphs for power, variability, and correction are presented below. If we examine the \(R^2\) values for each decile of our regression equation separately, we find that the 10% (0.967) and 30% deciles (0.972) represent the best match to our corrected sample size suggestions. The 10% is likely unfeasible for many researchers, as the sample sizes are often quite large. The 30% decile, in the corrected format, appears to meet all goals: 1) increases with heterogeneity, 2) higher suggested values for lower samples and a leveling effect at larger pilot data, and …
ggplot(summary_long %>% filter(source == "Low"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
ggplot(summary_long %>% filter(source == "Low"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
ggplot(summary_long %>% filter(source == "Medium"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
summary_long$new_sample <- summary_long$sample_size*decay
ggplot(summary_long %>% filter(source == "Medium"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
ggplot(summary_long %>% filter(source == "High"), aes(original_n, sample_size, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")
ggplot(summary_long %>% filter(source == "High"), aes(original_n, new_sample, color = name)) +
geom_point() +
geom_point(aes(original_n, original_n), color = "black") +
geom_line() +
theme_classic() +
facet_wrap(~ power) +
xlab("Original Sample Size") +
ylab("Suggested Sample Size") +
scale_color_discrete(name = "Cutoff Score")